# R syntax - preparing data for analysis 

library(foreign)
library(dplyr, lib.loc = "M:/R/Lib3")
library(tidyr, lib.loc = "M:/R/Lib3")
library(zoo, lib.loc = "M:/R/Lib3")

# Fix the random-number seed for this session.
set.seed(28101990)

# Work from the project folder and import the SPSS extract; SPSS value
# labels are turned into factor levels (use.value.labels = TRUE).
setwd("H:/data/dep_interview")
data <- read.spss(
  file = "data_dep_interview_limited_allcoh.sav",
  use.value.labels = TRUE
)

# Convert the imported object to a data frame and take a first look:
# overall summary plus the wave number by period cross-tabulation.
full_dataset <- as.data.frame(data)
summary(full_dataset)
table(full_dataset$EBBSTKPEILINGNUMMER, full_dataset$PERIODE)


# Creating a unique ID for each individual (based on the 3 rin variables).
# Rows are sorted by person and then by month first, so all rows of one
# person are contiguous and in month order.
full_dataset <- full_dataset[order(full_dataset$RINPERSOONS,
                                   full_dataset$RINPERSOON,
                                   full_dataset$RINPERSOON3,
                                   full_dataset$maand), ]

table(full_dataset$maand)

# Number the persons 1, 2, ... in sort order. The ID is derived from the rin
# variables themselves rather than from a hard-coded person count and panel
# length, so it stays correct when the extract changes.
person_key <- paste(full_dataset$RINPERSOONS,
                    full_dataset$RINPERSOON,
                    full_dataset$RINPERSOON3,
                    sep = "|")
full_dataset$ID <- match(person_key, unique(person_key))
rm(person_key)

# The positional lags used later in this script assume that every person has
# the same number of monthly rows; flag it when that does not hold.
if (length(unique(tabulate(full_dataset$ID))) > 1) {
  warning("Not every ID has the same number of rows; ",
          "position-based lags may mix data from different persons.",
          call. = FALSE)
}

# Move ID to the front of the data frame
full_dataset <- full_dataset %>% select(ID, everything())

# Preparation for a three-category LFS contract variable (permanent,
# temporary, other). First remove every individual who at any point had a
# temporary contract with prospect of a permanent one, because dependent
# interviewing was handled differently for that group.

table(full_dataset$EBBAFLPOSWRKFLEX1)

# Row-level flag: 1 when the row reports "Uitzicht op vast, vaste uren"
full_dataset$temp_prospect_perm <- as.numeric(
  full_dataset$EBBAFLPOSWRKFLEX1 %in% "Uitzicht op vast, vaste uren"
)
table(full_dataset$temp_prospect_perm)

# Person-level flag: 1 when any row of the person carries the row-level flag
full_dataset$temp_prospect_perm_max <- ave(
  full_dataset$temp_prospect_perm,
  full_dataset$ID,
  FUN = function(flags) max(flags, na.rm = TRUE)
)
table(full_dataset$temp_prospect_perm_max)

# Keep only the persons who never carried the flag
full_dataset2 <- subset(full_dataset, temp_prospect_perm_max != 1)

# Inspect the self-employment variable and the contract-type variable
table(full_dataset2$ZELFSTANDIGE1)
table(full_dataset2$CONTRACTSOORT)

# Blank contract codes ("   ") are treated as missing
is.na(full_dataset2$CONTRACTSOORT) <- which(full_dataset2$CONTRACTSOORT == "   ")
table(full_dataset2$CONTRACTSOORT)

# Re-create the factor so that levels no longer in use are dropped
full_dataset2$CONTRACTSOORT <- factor(full_dataset2$CONTRACTSOORT)
table(full_dataset2$EBBAFLPOSWRKFLEX1, full_dataset2$CONTRACTSOORT)


# Number of non-missing values per contract-related variable
sum(!is.na(full_dataset2$EBBAFLPOSWRKFLEX1))
sum(!is.na(full_dataset2$EbbWrkVastDnst1))
sum(!is.na(full_dataset2$CONTRACTSOORT))

table(full_dataset2$EBBAFLPOSWRKFLEX1, full_dataset2$ZELFSTANDIGE1, useNA = "always")

# Three-category LFS contract: O = permanent, B = temporary, N = other.
# Rows matching none of the rules below stay NA.
full_dataset2$CONTRACTLFS <- NA

# Permanent positions
full_dataset2$CONTRACTLFS[full_dataset2$EBBAFLPOSWRKFLEX1 %in% c(
  "Vast dienstverband",
  "Vast dienstverband zonder vaste uren"
)] <- "O"

# Temporary and flexible positions
full_dataset2$CONTRACTLFS[full_dataset2$EBBAFLPOSWRKFLEX1 %in% c(
  "Meerj.tijdelijk dienstv.vaste uren",
  "Ov. tijdelijk dienstv. vaste uren",
  "Uitzendkracht",
  "Oproepkracht of invalkracht",
  "Tijdelijk dienstv. zonder vaste uren"
)] <- "B"

# Self-employed or no job present; assigned last, so it overrides O and B
full_dataset2$CONTRACTLFS[
  full_dataset2$ZELFSTANDIGE1 %in% "Is zelfstandige" |
    full_dataset2$EBBAFLPOSWRKFLEX1 %in% "Geen werkkring aanwezig"
] <- "N"

table(full_dataset2$CONTRACTLFS)


# Joint distribution of the LFS contract and the CONTRACTSOORT contract type
prop.table(table(full_dataset2$CONTRACTLFS, full_dataset2$CONTRACTSOORT))

# Generating contract lags for LFS and ER, and numeric contract variables
# where 1 = permanent, 2 = temporary and 3 = other.
#
# dplyr::lag() is called with its namespace on purpose: a bare lag() resolves
# to stats::lag() whenever dplyr is not attached or is masked, and that
# function leaves the values unshifted without raising an error.
#
# The lags are positional (previous rows of the data frame), so this assumes
# rows are sorted by ID and maand with no months missing within a person.
# The first month(s) of each person are blanked because the shifted value
# there comes from the rows preceding that person.

# CONTRACTSOORT: lag of one month
full_dataset2$CONTRACTSOORT_LAG <- dplyr::lag(full_dataset2$CONTRACTSOORT, 1)
full_dataset2$CONTRACTSOORT_LAG[full_dataset2$maand == 1] <- NA

# LFS contract: lag of three months
full_dataset2$CONTRACTSLFS_LAG <- dplyr::lag(full_dataset2$CONTRACTLFS, 3)
full_dataset2$CONTRACTSLFS_LAG[full_dataset2$maand %in% 1:3] <- NA


# Numeric LFS contract: position in c("O", "B", "N") gives 1, 2, 3;
# anything else (including NA) becomes NA
full_dataset2$CONTRACTLFS_num <- as.numeric(
  match(full_dataset2$CONTRACTLFS, c("O", "B", "N"))
)

table(full_dataset2$CONTRACTLFS, full_dataset2$CONTRACTLFS_num)

full_dataset2$CONTRACTLFS_LAG_num <- as.numeric(
  match(full_dataset2$CONTRACTSLFS_LAG, c("O", "B", "N"))
)

table(full_dataset2$CONTRACTSLFS_LAG, full_dataset2$CONTRACTLFS_LAG_num)



# Numeric CONTRACTSOORT: same 1/2/3 coding, matched on the value labels
full_dataset2$CONTRACTSOORT_num <- as.numeric(match(
  full_dataset2$CONTRACTSOORT,
  c("Contract voor onbepaalde tijd",
    "Contract voor bepaalde tijd, stagiaires en uitzendkrachten",
    "SrtIV=17, =DGA")
))

table(full_dataset2$CONTRACTSOORT, full_dataset2$CONTRACTSOORT_num)

full_dataset2$CONTRACTSOORT_LAG_num <- as.numeric(match(
  full_dataset2$CONTRACTSOORT_LAG,
  c("Contract voor onbepaalde tijd",
    "Contract voor bepaalde tijd, stagiaires en uitzendkrachten",
    "SrtIV=17, =DGA")
))

table(full_dataset2$CONTRACTSOORT_LAG, full_dataset2$CONTRACTSOORT_LAG_num)

# Approximate a job-change variable based on the date the job started.
table(full_dataset2$EBBBEGDAT1)

full_dataset2$job_begin_date <- full_dataset2$EBBBEGDAT1

is.numeric(full_dataset2$SLEUTELEBB)

length(full_dataset2$job_begin_date[!is.na(full_dataset2$job_begin_date)])

# job_change is 1 when the job began after the SLEUTELEBB value found three
# rows earlier (job change occurred), 0 when it did not (no job change) and
# NA when the comparison is missing.
# dplyr::lag() is namespaced on purpose: a bare lag() resolves to stats::lag()
# whenever dplyr is not attached or is masked, and that function leaves the
# values unshifted without raising an error. The lag is computed once.
# NOTE(review): the lag is positional, so this assumes rows are sorted by ID
# and maand with no months missing within a person - confirm.
full_dataset2$job_change <- as.numeric(
  full_dataset2$job_begin_date > dplyr::lag(full_dataset2$SLEUTELEBB, 3)
)

# In months 1-3 the lagged value does not belong to the same person, so
# these rows are always coded as a job change
full_dataset2$job_change[full_dataset2$maand %in% 1:3] <- 1


table(full_dataset2$job_change)
table(full_dataset2$job_change, full_dataset2$maand, useNA = "always")


# Combine job change and the lagged LFS contract into one dummy:
# 1 = no job change and lagged LFS contract temporary (eligible for DI);
# 0 = job change or lagged LFS contract not temporary (not eligible; INDI);
# NA when neither rule can be established.

full_dataset2$eligible_DI <- with(full_dataset2, {
  flag <- rep(NA_real_, length(job_change))
  flag[which(job_change == 1 | CONTRACTLFS_LAG_num != 2)] <- 0
  flag[which(job_change == 0 & CONTRACTLFS_LAG_num == 2)] <- 1
  flag
})

table(full_dataset2$eligible_DI, full_dataset2$maand, useNA = "always")

# Creating a dummy for those who started LFS in 2009 (1) and those who
# started in 2010 (0). The start year is read from PERIODE (yyyymm) in the
# first month (maand == 1); all other rows stay NA at this stage.

full_dataset2$start2009 <- NA

full_dataset2$start2009[full_dataset2$PERIODE %in% 201001:201012 &
                          full_dataset2$maand == 1] <- 0 # start in 2010
full_dataset2$start2009[full_dataset2$PERIODE %in% 200901:200912 &
                          full_dataset2$maand == 1] <- 1 # start in 2009

# Copy the first-month value to every row of the person. A person without a
# usable first-month value gets NA: max() over an empty set would return -Inf
# (with a warning), and that -Inf would otherwise be carried into the
# exported file.
full_dataset2$start2009_final <- ave(
  full_dataset2$start2009,
  full_dataset2$ID,
  FUN = function(cohort) {
    if (all(is.na(cohort))) NA_real_ else max(cohort, na.rm = TRUE)
  }
)

table(full_dataset2$start2009_final, full_dataset2$EBBSTKPEILINGNUMMER)
list(full_dataset2$SLEUTELEBB[full_dataset2$start2009_final==1])
list(full_dataset2$SLEUTELEBB[full_dataset2$start2009_final==0])


table(full_dataset2$start2009_final, full_dataset2$maand, useNA = "always")
table(full_dataset2$start2009_final, full_dataset2$job_change, useNA = "always") # important
table(full_dataset2$start2009_final, full_dataset2$job_change, full_dataset2$maand, useNA = "always")

table(full_dataset2$eligible_DI, full_dataset2$start2009_final)

# DI variable with three categories:
#   1 = had INDI            (eligible_DI == 0)
#   0 = would have had DI   (eligible_DI == 1 and start2009_final == 0)
#   2 = had DI              (eligible_DI == 1 and start2009_final == 1)
# Rows matching none of these stay NA.

full_dataset2$DI <- with(full_dataset2, {
  di <- rep(NA_real_, length(eligible_DI))
  di[which(eligible_DI == 0)] <- 1
  di[which(eligible_DI == 1 & start2009_final == 0)] <- 0
  di[which(eligible_DI == 1 & start2009_final == 1)] <- 2
  di
})
table(full_dataset2$DI, full_dataset2$eligible_DI)
table(full_dataset2$DI, full_dataset2$start2009_final)

# Preparing covariates.
# Each covariate is taken from the first month (maand == 1) and copied to all
# rows of the person through the per-ID maximum; persons without a
# first-month value end up as NA (max() gives -Inf there, which is reset).
# NOTE(review): if these source variables are factors, the *_begin columns
# hold their integer codes rather than the labels - confirm this is intended.

# gender
table(full_dataset2$EBBHHBGESLACHT, full_dataset2$EBBAFLJAAR, useNA = "always")
full_dataset2$EBBHHBGESLACHT_begin <- replace(
  rep(NA, nrow(full_dataset2)),
  full_dataset2$maand == 1,
  full_dataset2$EBBHHBGESLACHT[full_dataset2$maand == 1]
)
full_dataset2$gender <- ave(
  full_dataset2$EBBHHBGESLACHT_begin,
  full_dataset2$ID,
  FUN = function(v) max(v, na.rm = TRUE)
)
full_dataset2$gender[which(full_dataset2$gender == -Inf)] <- NA
table(full_dataset2$gender, full_dataset2$maand)

# education
table(full_dataset2$SOI3HB, full_dataset2$EBBAFLJAAR, useNA = "always")
full_dataset2$SOI3HB_begin <- replace(
  rep(NA, nrow(full_dataset2)),
  full_dataset2$maand == 1,
  full_dataset2$SOI3HB[full_dataset2$maand == 1]
)
full_dataset2$education <- ave(
  full_dataset2$SOI3HB_begin,
  full_dataset2$ID,
  FUN = function(v) max(v, na.rm = TRUE)
)
full_dataset2$education[which(full_dataset2$education == -Inf)] <- NA
table(full_dataset2$education, full_dataset2$maand)

# nationality
table(full_dataset2$HERKOMSTLA5, full_dataset2$EBBAFLJAAR, useNA = "always")
full_dataset2$HERKOMSTLA5_begin <- replace(
  rep(NA, nrow(full_dataset2)),
  full_dataset2$maand == 1,
  full_dataset2$HERKOMSTLA5[full_dataset2$maand == 1]
)
full_dataset2$nationality <- ave(
  full_dataset2$HERKOMSTLA5_begin,
  full_dataset2$ID,
  FUN = function(v) max(v, na.rm = TRUE)
)
full_dataset2$nationality[which(full_dataset2$nationality == -Inf)] <- NA
table(full_dataset2$nationality, full_dataset2$maand)

# age
table(full_dataset2$EBBAFLLFT, full_dataset2$EBBAFLJAAR, useNA = "always")
hist(full_dataset2$EBBAFLLFT)

# Restrict the analysis to individuals who were between 25 and 55 at the
# first wave of the LFS. age_begin holds the age at maand 1 and 0 on every
# other row, so the per-person maximum picks up the first-wave age (or 0
# when that age is missing).
full_dataset2$age_begin <- replace(
  rep(0, nrow(full_dataset2)),
  full_dataset2$maand == 1,
  full_dataset2$EBBAFLLFT[full_dataset2$maand == 1]
)
full_dataset2$age <- ave(
  full_dataset2$age_begin,
  full_dataset2$ID,
  FUN = function(v) max(v, na.rm = TRUE)
)
table(full_dataset2$age, full_dataset2$maand)

full_dataset3 <- subset(full_dataset2, age >= 25 & age <= 55)
hist(full_dataset3$age)
summary(full_dataset3$age)


# Only keeping the LFS observations and the variables used for analysis

table(full_dataset3$CONTRACTSOORT_LAG, full_dataset3$CONTRACTSOORT_LAG_num)
table(full_dataset3$EBBSTKPEILINGNUMMER)

# LFS rows are those with a wave number (EBBSTKPEILINGNUMMER) from 1 to 5
only_lfs <- subset(full_dataset3, EBBSTKPEILINGNUMMER %in% 1:5)

only_lfs <- only_lfs[order(only_lfs$ID, only_lfs$maand), ]

length(full_dataset3$ID)
table(full_dataset3$maand)

# month = running observation number within each person (1, 2, ... in maand
# order). It is derived from the data rather than from a hard-coded number of
# persons, so it stays correct when the sample changes.
only_lfs$month <- as.numeric(
  ave(seq_along(only_lfs$ID), only_lfs$ID, FUN = seq_along)
)

# The analysis expects five LFS waves per person; flag deviations.
if (any(table(only_lfs$ID) != 5)) {
  warning("Not every ID has exactly 5 LFS observations; ",
          "'month' then counts the observations that are present.",
          call. = FALSE)
}

table(only_lfs$month)

only_lfs$month_cat <- only_lfs$month
table(only_lfs$month_cat)

dataset_subset <- subset(only_lfs, select = c(ID, month, month_cat, gender, age, nationality, education, start2009_final, eligible_DI, CONTRACTLFS_num, CONTRACTLFS_LAG_num, CONTRACTSOORT_num, CONTRACTSOORT_LAG_num, DI))

# Export the analysis file for use in LatentGold. Despite the .csv extension
# the file is tab-delimited and unquoted, with "." written for missing values.

write.table(
  x = dataset_subset,
  file = "H:/data/dep_interview/dep_interview_data_lfs_allcoh_new.csv",
  quote = FALSE,
  sep = "\t",
  na = ".",
  row.names = FALSE
)
